
### Please cite Salas, Ricardo and Young, Kevin. 2024. "Where Did the Global Elite Go to School", Global Networks, forthcoming [or revise to issue date once published...]

### Set working directory to wherever you have downloaded the main data file...
setwd("[wherevermyworkingdirectoryis]")

### Calling a few packages that will be used...
library(haven)
library(dplyr)
library(openxlsx)

### Bring in the edgelist used for analysis...
file_path <- "GlobalEliteEduc_Edgelist.dta"
BigEliteEdge_RAW <- read_dta(file_path)
nrow(BigEliteEdge_RAW)
head(BigEliteEdge_RAW)
View(BigEliteEdge_RAW)

### Set key parameter of whether US nationals are included in the analysis...
## This object is important because it informs the names of files printed later on in this R script...
US_INCLUDED_OR_NOT <- "US INCLUDED"
#
########################################################################
# ### IMPORTANT: Here is the place to REMOVE THE US NATIONALS####
# ### Un-comment this block of code if you want to re-run the analysis excluding US nationals...Otherwise, for the full dataset, keep this commented off.
########################################################################
# BigEliteEdge_RAW <-  BigEliteEdge_RAW[which(BigEliteEdge_RAW$American !=1),]
# nrow(BigEliteEdge_RAW)
# ### Here is where we indicate whether we excluded the US (overwrites the default)
# US_INCLUDED_OR_NOT <- "US EXCLUDED"
# #####
########################################################################

#### Select relevant variables that are needed to build the bipartite network....
BigEliteEdge <- BigEliteEdge_RAW  %>% select(NAME, REPLACED_attribute)

### Convert edgelist to network object...
library(igraph)
BigNet_Bipartite <- graph.data.frame(BigEliteEdge, directed=F)
vcount(BigNet_Bipartite)
ecount(BigNet_Bipartite)

E(BigNet_Bipartite)$weight <- 1


#### Identifying Bipartite Structure ####
V(BigNet_Bipartite)$type <- bipartite.mapping(BigNet_Bipartite)$type

#### Using bipartite structure to colour nodes differently...
V(BigNet_Bipartite)$mode_colour <- ifelse(V(BigNet_Bipartite)$type ==TRUE, "blue", "orange")
table(V(BigNet_Bipartite)$type)

### Bipartite mapping...
V(BigNet_Bipartite)$type <- bipartite.mapping(BigNet_Bipartite)$type
### Bipartite projection
BigNet_UNIs <- bipartite.projection(BigNet_Bipartite,  type=V(BigNet_Bipartite)$type)$proj2

### Taking a look at the weights of the one-mode/unipartite network...
table(E(BigNet_UNIs)$weight)


#### Producing Ranked Degree Distribution ####
dc <- as.data.frame(strength(BigNet_UNIs, weights=E(BigNet_UNIs)$weight))
table(E(BigNet_UNIs)$weight)
head(dc)
colnames(dc)[1] <- paste0("strength_degree")
dc$rank <- rank(-dc, ties.method="random")
colnames(dc)[2] <- paste0("rank")
dc_ranked <- dc[order(dc$rank),] 
dc_ranked$attribute <- row.names(dc_ranked)
head(dc_ranked, 20)

nrow(dc_ranked)
### Obtain quintiles of degree distribution...
dc_ranked$ntile <- ntile(dc_ranked$strength_degree, 100)
table(dc_ranked$ntile)


### Implementing the 'fracture test' method, described in Appendix of paper...
#### Begin the Fracture Test loop here! ####
nrow(dc_ranked)
dc_ranked$fracture <- 999
nr_i <- nrow(dc_ranked)
# nr_i <- 10

for(i in 1:nr_i) {       

# ### Sample 1 at random!!!!!
# sampled_row <- dc_ranked %>% sample_n(1)

sampled_row <-  dc_ranked[which(dc_ranked$rank==i),]
sampled_attribute <- unique(sampled_row$attribute)
sampled_attribute 

net_less_chosen_ntile <- delete.vertices(BigNet_UNIs, (V(BigNet_UNIs)$name %in% sampled_attribute)==TRUE)
### Should be only one node difference...
vcount(BigNet_UNIs) - vcount(net_less_chosen_ntile)

### Baselines of unedited network
cl <- clusters(BigNet_UNIs)
nr_clusters_try <- cl$no

### Counting components...
cl <- clusters(net_less_chosen_ntile)
nr_clusters_net_less_chosen_ntile <- cl$no

### Put it into the dataframe...
dc_ranked$fracture[dc_ranked$attribute==sampled_attribute] <- nr_clusters_net_less_chosen_ntile

}
#### End Loop Here #####

table(dc_ranked$fracture)
head(dc_ranked)
# View(dc_ranked)
dev.off()

### Calculating betweenness centrality here....
bw <- as.data.frame(betweenness(BigNet_UNIs, weights=E(BigNet_UNIs)$weight))
colnames(bw)[1] <- "Betweeness"
bw$attribute <- row.names(bw)
# hist(bw)

### Merging dc and betweenness here....
dc_ranked_bw <- merge(dc_ranked, bw, by="attribute")
head(dc_ranked_bw)


## Eigenvector Centrality ####
evcentstoreW <-as.data.frame(evcent(BigNet_UNIs, directed=F)$vector)
colnames(evcentstoreW)[1] <- "evcent"
library(tibble)
evcentstoreW <- tibble::rownames_to_column(evcentstoreW, "attribute")
head(evcentstoreW)


### Merging simple net stat lists here....
dc_ranked_bw_cl <- merge(dc_ranked_bw, evcentstoreW, by="attribute")
head(dc_ranked_bw_cl)

### Kcores
kcore <- as.data.frame(coreness(BigNet_UNIs))
colnames(kcore)[1] <- "kcore"
kcore$attribute <- row.names(kcore)
head(kcore)

### Merging dc and kcore here....
dc_ranked_bw_cl <- merge(dc_ranked_bw, kcore, by="attribute")
head(dc_ranked_bw_cl)


### Analysis of Correlations Across Variables...

### Simple bivariate correlations...
cor(dc_ranked_bw_cl$Betweeness, dc_ranked_bw_cl$strength_degree)
cor(dc_ranked_bw_cl$Betweeness, dc_ranked_bw_cl$fracture)
cor(dc_ranked_bw_cl$strength_degree, dc_ranked_bw_cl$kcore)

# install.packages("PerformanceAnalytics")
library(PerformanceAnalytics)

#### Select relevant variables....
head(dc_ranked_bw_cl)
my_data <- dc_ranked_bw_cl  %>% select(strength_degree, fracture, Betweeness, kcore)
head(my_data)
my_data$Betweeness <- log(1 + my_data$Betweeness)
my_data$strength_degree <- log(1 + my_data$strength_degree)
head(my_data)

### Simple Correlation Matrix
res <- cor(my_data)
round(res, 2)

pdf("Analysis Modules/Analysis 4 - Fracturing Tests and Network Stats/Correlations in Node-Level Measures.pdf")
chart.Correlation(my_data, histogram=TRUE, pch=19, title="Correlations in Node-Level Measures")
dev.off()


### Re-Rank the rows based on strength degree (only including educational institutions)
dc_ranked_bw_cl$rank <- rank(-dc_ranked_bw_cl$strength_degree, ties.method="random")

dc_ranked_bw_cl$label_by_fracture <- ifelse(dc_ranked_bw_cl$fracture>1, dc_ranked_bw_cl$attribute, "")


### Save output 
save(dc_ranked_bw_cl, file="Analysis Modules/Results Across Analyses/dc_ranked_bw_cl.RData")




### Plotting network metrics here ######
library(ggplot2)
library(ggrepel)
library(gridExtra)


### Parameter here for font size on the ggplot! ####
fontlabsize <- 1.8

df <- dc_ranked_bw_cl
### Reduce labels to just the top N for the purpose of plotting
df$attribute <- ifelse(df$rank<=15, df$attribute, "")
unique(df$attribute)



### Scatterplot of results... strength degree
scatter1 <- ggplot(df) +
  geom_point(aes(rank, strength_degree), size = .5, color='dark green')+
  labs (x = "Strength Rank", y = "Strength Degree", title="Strength Degree") +   theme_classic() +
  geom_label_repel(data=df, size=fontlabsize, aes(rank, strength_degree, label = attribute), box.padding   = 0.9, point.padding = 0.8, segment.color = 'blue', max.overlaps = 100) 
######

dev.off()

### Scatterplot of results... betweenness...
scatter2 <- ggplot(df) +
  geom_point(aes(rank, Betweeness), size = .5, color='dark green')+
  labs (x = "Strength Rank", y = "Betweeness Score", title="Betweenness") +   theme_classic() +
  geom_label_repel(data=df, size=fontlabsize, aes(rank, Betweeness, label = attribute), box.padding   = 0.9, point.padding = 0.8, segment.color = 'blue', max.overlaps = 100) 
######

### Scatterplot of results...kciore....
scatter3 <- ggplot(df) +
  geom_point(aes(rank, kcore), size = .5, color='dark green')+
  labs (x = "Strength Rank", y = "Kcore", title="Kcore") +   theme_classic() +
  geom_label_repel(data=df, size=fontlabsize, aes(rank, kcore, label = attribute), box.padding   = 0.9, point.padding = 0.8, segment.color = 'blue', max.overlaps = 100) 
######

### Scatterplot of results... fracture score..
scatter4 <- ggplot(df) +
  geom_point(aes(rank, fracture), size = .5, color='dark green')+
  labs (x = "Strength Rank", y = "Fracture Score", title="Fracturing") +   theme_classic() +
  geom_label_repel(data=df, size=fontlabsize, aes(rank, fracture, label = label_by_fracture), box.padding   = 0.9, point.padding = 0.8, segment.color = 'blue', max.overlaps = 70) 
######



pdf(paste0("Analysis Modules/Analysis 4 - Fracturing Tests and Network Stats/Scatterplots of 4 Node Distributions - All Educ_", US_INCLUDED_OR_NOT , ".pdf"))
require(gridExtra)
ga1 <- grid.arrange(scatter1, scatter2,ncol=1, nrow=2, top = "")
ga1
ga2 <- grid.arrange(scatter3, scatter4,ncol=1, nrow=2, top = "")
ga2
dev.off()



#### Generating network visualisations ####
### Have a look at the strong ties...
edgy <- cbind(get.edgelist(BigNet_UNIs), E(BigNet_UNIs)$weight)
head(edgy)
class(edgy)
edgy <- as.data.frame(edgy)
names(edgy) <- c("V1", "V2", "weight")
edgy$weight <- as.numeric(edgy$weight)
head(edgy)


### Reduce edge weight...
mean(edgy$weight)
sd(edgy$weight)
weight_threshold <- mean(edgy$weight) + sd(edgy$weight) 
weight_threshold
TOPedgy <-  edgy[which(edgy$weight > weight_threshold),]
nrow(TOPedgy)
head(TOPedgy)
TOP_UNIs_net <- graph.data.frame(TOPedgy, directed=FALSE)


### Reducing unipartite network to largest connected component
cl <- clusters(BigNet_UNIs)
cl$membership
cl$csize
cl$no
MAXCLUSTERSIZE <- max(cl$csize)
MAXCLUSTERSIZE
small.clusters <- which(cl$csize < MAXCLUSTERSIZE )
small.clusters
vertices.to.delete <- which(cl$membership %in% small.clusters)
BigNet_UNIs_LCC <- delete_vertices(BigNet_UNIs, vertices.to.delete)
## now re-checking...
cl <- clusters(BigNet_UNIs_LCC)
cl$csize
cl$no

### Reducing TOP unipartite network to largest connected component
cl <- clusters(TOP_UNIs_net)
cl$membership
cl$csize
cl$no
MAXCLUSTERSIZE <- max(cl$csize)
MAXCLUSTERSIZE
small.clusters <- which(cl$csize < MAXCLUSTERSIZE )
small.clusters
vertices.to.delete <- which(cl$membership %in% small.clusters)
TOP_UNIs_net_LCC <- delete_vertices(TOP_UNIs_net, vertices.to.delete)
## now re-checking...
cl <- clusters(TOP_UNIs_net_LCC)
cl$csize
cl$no

table(coreness(BigNet_UNIs))
mean(coreness(BigNet_UNIs))

### Reducing to the 10core...
cored_BigNet_UNIs <- delete_vertices(BigNet_UNIs, coreness(BigNet_UNIs)<10)

# 
# ##### For the US-excluded sample, the top core is 6...
# cored_BigNet_UNIs <- delete_vertices(BigNet_UNIs, coreness(BigNet_UNIs)<6)

### Pretty colours for different parts of the network
colvec<-rep(rgb(150,2,0, 220, names=NULL, maxColorValue=255))  
colvec_edge<-rep(rgb(0,0,180, 70, names=NULL, maxColorValue=255))  
colvec_nodeframe<-rep(rgb(1,10,3, 20, names=NULL, maxColorValue=255))  
colvec_mostly_transparent<-rep(rgb(1,5,5, 5, names=NULL, maxColorValue=255))  

tkcores <- as.data.frame(table(coreness(BigNet_UNIs)))
head(tkcores)


### Scatterplot of results... fracture score..
 ggplot(tkcores) +
  geom_point(aes(Var1, Freq), size = 1.5, color='dark green')+ labs (x = "Kcore", y = "Number of Universities", title="Distribution of Kcore") +   theme_classic() 



pdf(paste("Different Plots of Uni Networks, including Figure14 at end", US_INCLUDED_OR_NOT, ".pdf"))
par(mai=c(.2,.2,.2,.2))
par(mfrow=c(1,1))
plot(BigNet_UNIs, vertex.size=.03, vertex.label.cex=.2, edge.width=E(BigNet_UNIs)$weight/10, edge.color=colvec_edge)
plot(BigNet_UNIs, vertex.size=.03, vertex.label.cex=.0002, edge.width=E(BigNet_UNIs)$weight/10, edge.color=colvec_edge)

plot(BigNet_UNIs_LCC, vertex.size=.03, vertex.label.cex=.2, edge.width=E(BigNet_UNIs_LCC)$weight/10, edge.color=colvec_edge)
plot(TOP_UNIs_net, vertex.size=.03, vertex.label.cex=.2, edge.width=E(TOP_UNIs_net)$weight/10, edge.color=colvec_edge)
plot(TOP_UNIs_net, vertex.size=.03, vertex.label.cex=.0002, edge.width=E(TOP_UNIs_net)$weight/10, edge.color=colvec_edge)
plot(TOP_UNIs_net_LCC, vertex.size=.1, vertex.label.cex=.4, edge.width=E(TOP_UNIs_net_LCC)$weight/10, edge.color=colvec_edge)

plot(cored_BigNet_UNIs, vertex.size=.5, vertex.label.cex=.8, edge.width=E(cored_BigNet_UNIs)$weight/3, edge.color=colvec_edge)

plot(cored_BigNet_UNIs, vertex.size=.5, vertex.label.cex=.4, edge.width=E(cored_BigNet_UNIs)$weight, edge.color=colvec_edge)

dev.off()



# Let's measure centralization, via degree, eigenvector, closeness
# Freeman, L.C. (1979). Centrality in Social Networks I: Conceptual Clarification. Social Networks 1, 215–239.
# Wasserman, S., and Faust, K. (1994). Social Network Analysis: Methods and Applications. Cambridge University Press.


# And let's calculate the power law exponent...
# M. E. J. Newman. 2005. Power laws, Pareto distributions and Zipf's law,  Contemporary Physics, 46, 323-351
# Aaron Clauset, Cosma R .Shalizi and Mark E.J. Newman: Power-law distributions in empirical data. SIAM Review 51(4):661-703, 2009.


#Cycle across all years to gather density scores#
net_stats_df <-data.frame(matrix(NA,1,2)) ### This sets up an empty dataframe 
names(net_stats_df)<-c("Measure", "BigNet_UNIs") ### This labels the columns
head(net_stats_df)

net_stats_df[1,1] <- "Nodes"
net_stats_df[2,1] <- "Edges"
net_stats_df[3,1] <- "Degree Centralization"
net_stats_df[4,1] <- "Eigen Centralization"
net_stats_df[5,1] <- "Power Law Exponent"
net_stats_df[6,1] <- "KS test statistic"

## Should show a blank dataframe...
View(net_stats_df)

### Smaller KS test statistic scores denote better fit with a power-law distribution
## Smaller KS p-values indicate that the test rejected the H that the original data could have been drawn from the fitted power-law distribution

### Disable scientific notion...
options(scipen = 999)

net_stats_df[1,2] <- ecount(BigNet_UNIs)
net_stats_df[2,2] <- vcount(BigNet_UNIs)
net_stats_df[3,2] <- centr_degree(BigNet_UNIs)$centralization
net_stats_df[4,2] <- centr_eigen(BigNet_UNIs)$centralization
net_stats_df[5,2] <- fit_power_law(strength(BigNet_UNIs, weights=E(BigNet_UNIs)$weight))$alpha
net_stats_df[6,2] <- fit_power_law(strength(BigNet_UNIs, weights=E(BigNet_UNIs)$weight))$KS.stat



net_stats_df$BigNet_UNIs <- round(net_stats_df$BigNet_UNIs , 2)
head(net_stats_df, 10)

### Print these results as a table...
# devtools::install_github("davidgohel/flextable")
library(flextable)
save_as_docx(flextable(net_stats_df), path="Net Stats - Full Sample.docx")
print(flextable(net_stats_df))
